# Preliminaries ####

## read the file in ####
rm(list = ls())
library(tidyverse)
library(texreg)
library(ROCR)
library(survival)

# data:
d <- read.csv("data_main_aan25_militias_neg.csv")

# na.omit model selection:
d <- d |>
   dplyr::select(
      Negotiation, DyadId,
      Mil_vs_Reb_dm_lag, Mil_vs_Civ_lag, Mil_vs_Gov_lag,
      Mil_vs_Reb_oth_lag,
      Mil_vs_Mil_lag,
      impgdppcln, imppopln, v2x_polyarchy, dydurln,
      Intensity_rate_lag,  lull, brd_d_ln,
      Ethnicincomp, rebno_ct,
      violence_civ_gov_lag_ln, violence_civ_reb_lag_ln,
      PrevMed_5y, UN, Internationalized, odapcln,
      RebTerCont, RebPolWingLeg,  RebExpSup,
      tsln, tsln2, tsln3
   ) |>
   na.omit()

# row names updated:
row.names(d) <- 1:nrow(d)


## m1 full ####
fm1 <- formula(
   Negotiation ~ 
      Mil_vs_Reb_dm_lag  + Mil_vs_Civ_lag + Mil_vs_Gov_lag + 
      impgdppcln + imppopln + v2x_polyarchy + dydurln +
      Intensity_rate_lag +  brd_d_ln  +
      tsln + tsln2 + tsln3)

## m2 full ####
fm2 <- formula(
   Negotiation ~ 
      Mil_vs_Reb_dm_lag  + Mil_vs_Civ_lag + Mil_vs_Gov_lag + 
      Mil_vs_Reb_oth_lag + 
      impgdppcln + imppopln + v2x_polyarchy + dydurln +
      Intensity_rate_lag +  brd_d_ln  +
      rebno_ct + lull + UN + Ethnicincomp +
      tsln + tsln2 + tsln3)


## m3 full ####
fm3 <- formula(
   Negotiation ~ 
      Mil_vs_Reb_dm_lag  + Mil_vs_Civ_lag + Mil_vs_Gov_lag + 
      Mil_vs_Reb_oth_lag + 
      Mil_vs_Mil_lag  +
      impgdppcln + imppopln + v2x_polyarchy + dydurln +
      Intensity_rate_lag +  brd_d_ln  +
      rebno_ct + lull + UN + Ethnicincomp + 
      violence_civ_reb_lag_ln + violence_civ_gov_lag_ln +
      tsln + tsln2 + tsln3)

# run the models ####
## full models ####
m1 <- glm(fm1, data = d, family = binomial(link = "logit"))
m2 <- glm(fm2, data = d, family = binomial(link = "logit"))
m3 <- glm(fm3, data = d, family = binomial(link = "logit"))

screenreg(list(m1, m2, m3))


# formulas for each variable (leave one out)
# create a list of formulas
# first formula is the baseline for cross-validation (model 2)
# second formula is without any militia variable 
# third is all but leave one out

formulas <- list()

# Cross-valiation for Model 2: #### 
formulas[[1]] <- fm2

# baseline variables ####
baseline_vars <- names(m2$coefficients)[-1] # first is dropped because intercept

# second formula: without any militia ####
# first four variables are related to militias. without them: 
formula_variables <- baseline_vars[5:length(baseline_vars)]

# create the temp formula
formula_temp <- paste0("Negotiation ~ ", 
                       paste(formula_variables, collapse = " + "))

# retain it in formulas list
formulas[[2]] <- as.formula(formula_temp)

# name of formulas:
formulas_name <- list()
formulas_name[[1]] <- "Full"
formulas_name[[2]] <- "All Militia Vars"

# loop to create remaining formulas (leave one out ) ####
# last three formulas are not needed (tsln always necessary)
loop_length <- length(baseline_vars) - 3

# loop 
for (i in 1:loop_length){
   # formula variables (all vars minus in the baseline)
   formula_variables <- baseline_vars[-i]
   
   # create the formula
   formula_temp <- paste0("Negotiation ~ ", paste(formula_variables, collapse = " + "))
   
   # get the output out as firmula
   formulas[[i + 2]] <- as.formula(formula_temp)
   
   formulas_name[[i + 2]] <- baseline_vars[i]
}

# formulas are read
formulas
formulas_name
formulas_name <- unlist(formulas_name)


# Sampling numbers ####
# this is indeces for cross-validation
set.seed(42)
# n runs: 2000 
n_runs <- 100

# k_fold
k_fold <- 4

# indeces matrix
indeces <- replicate(
   n_runs,
   expr = sample(1:k_fold, size = nrow(d), replace = T)
)

# main loop: cross validation ####

# output will be this big:
# (items in formulas) x (n_runs [i.e. how many iterations])
# create a list. first level is for each formula
# second level is the iterations (aucs)
output_list <- list()

# an empty display of the output: 
for(frm_i in 1:length(formulas)){
   output_list[[frm_i]] <- rep("auc_holder", n_runs)
}
# output_list <- list()



for (i_iter in 1:n_runs) { # this is over n_runs
   
   # iteration index:
   index <- indeces[, i_iter] 
   
   # estimating the models:
   for(frm_i in 1:length(formulas)){
      
      # model formula:
      fm_x <- formulas[[frm_i]]
      
      # prediction output:
      output_prediction <- rep(NA, nrow(d))
      
      # k_fold part:
      for(gg in 1:k_fold) { # this is for each fold
         
         # train data
         train_data <- d[index != gg,]
         
         # test data
         test_data <- d[index == gg, ]
         
         # run the glm
         est_mx <- glm(
            formula =  fm_x, 
            family  =  binomial(link = "logit"), 
            data    =  train_data
         )
         
         # predict the model using test data
         predict_oos <- predict(
            est_mx, 
            newdata = test_data, 
            type = "response", 
            na.action = na.omit)
         
         # get the output from this level
         output_prediction[as.numeric(names(predict_oos))] <- predict_oos
      } # outside of k_fold loop
      pp <- prediction(output_prediction, d$Negotiation)
      
      # auc
      auc_list   <- performance(pp, measure = "auc")
      auc_scalar <- auc_list@y.values[[1]]
      
      # output: 
      output_list[[frm_i]][i_iter] <-  auc_scalar
   }
   print(paste0(i_iter, " of ", n_runs, " is done." ))
   print(paste0(n_runs - i_iter, " to go."))
}


# turn the output into df
auc_df <- bind_cols(output_list)
colnames(auc_df) <- formulas_name

# numeric
auc_df <- sapply(auc_df, as.numeric)  |> as.data.frame()

# difference:
auc_df <- sapply(auc_df, function(x){auc_df$Full - x}) |> data.frame()

names(auc_df) <- c("Model 2 Full",
                   "All Militia Variables",
                   "Militia vs Rebels",
                   "Militia vs Civilians",
                   "Militia vs Government",
                   "Militia vs Other Rebels",
                   "GDP per capita (ln)",
                   "Population (ln)",
                   "V-Dem Polyarchy",
                   "Duration",
                   "Intensity rate",
                   "Battle-related deaths (ln)",
                   "Number of rebel groups",
                   "Conflict lull",
                   "UN peacekeeping",
                   "Ethnic conflict")

# plot df
p.df <- stack(auc_df)

# mode full drop
p.df <- p.df[!p.df$ind == "Model 2 Full",]

# median
p.df <- p.df |> group_by(ind) |> mutate(median = median(values)) |> ungroup()

# re-order highest median
p.df <- p.df[order(p.df$median, decreasing = T), ]

# add order no:
order_no <- p.df |> select(ind, median) |> unique()

# order:
order_no$order <- rev(1:nrow(order_no))
p.df <- merge(p.df, order_no, all = T, by = "ind")

# re-order based on order no
p.df <- p.df[order(p.df$order, decreasing = F), ]
unique(p.df$ind)
p.df$order <- factor(p.df$order, labels = unique(p.df$ind))

plt <- ggplot(p.df, 
       aes(y = order, x = values)) +
   geom_boxplot(width =  0.5, outlier.size = 0.25, size = 0.9) + 
   xlim(-0.04, 0.04) +
   geom_vline(xintercept = 0, linetype = "dashed", size = 0.7, col = "red") +
   xlab("Difference in AUC") +
   ylab("") +
   theme_minimal(base_size = 20) +
   theme(axis.text = element_text(size=20, color="black"))  +
   theme(plot.margin=grid::unit(c(0,0,0,0), "mm")) 


plt

ggsave("auc_dif2.png",
       plot = plt,
       device = NULL,
       path = NULL,
       scale = 2,
       width = (210-50), 
       height = (297-50)*0.65,
       units = c("mm"),
       dpi = 300,
       limitsize = TRUE,
       bg = "white")


